dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\property_water_claims_non_cat_fs_v5.csv", header=TRUE)
Claim Partial Dependency XGB Classification dataset exported from a Python notebook
pd_gamma_dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\EDA_Severity_Gamma_PartialDependency.csv", header=TRUE)
pd_normal_dataset <- read.csv("C:\\Kate\\Research\\Property\\Data\\EDA _Severity_Normal_PartialDependency.csv", header=TRUE)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.6.5 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(ggplot2)
colnames(dataset)
## [1] "modeldata_id"
## [2] "systemidstart"
## [3] "systemidend"
## [4] "cal_year"
## [5] "startdate"
## [6] "enddate"
## [7] "startdatetm"
## [8] "enddatetm"
## [9] "ecy"
## [10] "log_ecy"
## [11] "policynumber"
## [12] "policy_uniqueid"
## [13] "policyterm"
## [14] "policytype"
## [15] "effectivedate"
## [16] "expirationdate"
## [17] "policystate"
## [18] "policyform"
## [19] "persistency"
## [20] "companycd"
## [21] "carriercd"
## [22] "agency_group"
## [23] "producername"
## [24] "territory"
## [25] "risknumber"
## [26] "risktype"
## [27] "yearbuilt"
## [28] "log_yearbuilt"
## [29] "sqft"
## [30] "log_sqft"
## [31] "stories"
## [32] "roofcd"
## [33] "roofcd_encd"
## [34] "units"
## [35] "occupancycd"
## [36] "occupancy_encd"
## [37] "allperilded"
## [38] "waterded"
## [39] "protectionclass"
## [40] "constructioncd"
## [41] "constructioncd_encd"
## [42] "fire_risk_model_score"
## [43] "multipolicyind"
## [44] "multipolicyindumbrella"
## [45] "earthquakeumbrellaind"
## [46] "usagetype"
## [47] "usagetype_encd"
## [48] "ordinanceorlawpct"
## [49] "functionalreplacementcost"
## [50] "homegardcreditind"
## [51] "sprinklersystem"
## [52] "landlordind"
## [53] "rentersinsurance"
## [54] "firealarmtype"
## [55] "burglaryalarmtype"
## [56] "waterdetectiondevice"
## [57] "neighborhoodcrimewatchind"
## [58] "propertymanager"
## [59] "safeguardplusind"
## [60] "kitchenfireextinguisherind"
## [61] "gatedcommunityind"
## [62] "deadboltind"
## [63] "poolind"
## [64] "replacementcostdwellingind"
## [65] "replacementvalueind"
## [66] "serviceline"
## [67] "equipmentbreakdown"
## [68] "numberoffamilies"
## [69] "insuredage"
## [70] "maritalstatus"
## [71] "insurancescore"
## [72] "overriddeninsurancescore"
## [73] "insurancescorevalue"
## [74] "insscoretiervalueband"
## [75] "financialstabilitytier"
## [76] "allcov_wp"
## [77] "cova_wp"
## [78] "cova_ep"
## [79] "cova_deductible"
## [80] "log_cova_deductible"
## [81] "cova_limit"
## [82] "log_cova_limit"
## [83] "cova_ic_nc_water"
## [84] "hasclaim"
## [85] "cova_il_nc_water"
## [86] "log_cova_il_nc_water"
## [87] "water_risk_3_blk"
## [88] "log_water_risk_3_blk"
## [89] "water_risk_fre_3_blk"
## [90] "log_water_risk_fre_3_blk"
## [91] "water_risk_sev_3_blk"
## [92] "log_water_risk_sev_3_blk"
## [93] "appl_fail_3_blk"
## [94] "fixture_leak_3_blk"
## [95] "pipe_froze_3_blk"
## [96] "plumb_leak_3_blk"
## [97] "rep_cost_3_blk"
## [98] "ustructure_fail_3_blk"
## [99] "waterh_fail_3_blk"
## [100] "loaddate"
## [101] "customer_cnt_active_policies"
## [102] "customer_cnt_active_policies_binned"
dataset <- dataset[dataset$cova_il_nc_water>0,]
str(dataset)
## 'data.frame': 14141 obs. of 102 variables:
## $ modeldata_id : int 369477 1770375 1102527 307428 222322 982521 208890 822803 1171971 368993 ...
## $ systemidstart : int 786325 7261637 3342569 555817 365732 2718577 354808 2184406 3877974 785617 ...
## $ systemidend : int 786325 7261637 3342569 555817 365732 2718577 354808 2184406 3877974 785617 ...
## $ cal_year : int 2011 2019 2016 2011 2011 2016 2010 2014 2017 2012 ...
## $ startdate : Factor w/ 4135 levels "2009-01-01","2009-01-02",..: 731 3653 2557 912 731 2557 397 1895 2923 1096 ...
## $ enddate : Factor w/ 4104 levels "2009-01-09 00:00:00",..: 733 3741 2856 1063 923 2619 698 2159 2957 1403 ...
## $ startdatetm : Factor w/ 22990 levels "2008-01-09 00:00:00",..: 1998 19948 13788 4786 2981 12582 1979 10633 14314 5871 ...
## $ enddatetm : Factor w/ 16232 levels "2009-01-09 00:00:00",..: 2179 13966 9995 4196 2908 9054 2164 7732 10430 4810 ...
## $ ecy : num 0.0958 0.3312 0.9089 0.5037 0.616 ...
## $ log_ecy : num -2.3455 -1.105 -0.0955 -0.6858 -0.4845 ...
## $ policynumber : Factor w/ 243197 levels "AZF0082147","AZF0221975",..: 1093 2338 2387 23585 23503 23763 22523 1161 29650 1628 ...
## $ policy_uniqueid : int 365451 1523478 968090 290101 193025 869673 185822 717770 1005539 365024 ...
## $ policyterm : int 1 8 5 3 2 7 2 5 8 2 ...
## $ policytype : Factor w/ 2 levels "New","Renewal": 1 2 2 2 2 2 2 2 2 2 ...
## $ effectivedate : Factor w/ 4178 levels "2008-01-09","2008-01-14",..: 444 3621 2566 955 634 2329 440 1938 2668 1113 ...
## $ expirationdate : Factor w/ 4299 levels "2009-01-09","2009-01-14",..: 565 3742 2688 1077 755 2451 561 2059 2789 1235 ...
## $ policystate : Factor w/ 3 levels "AZ","CA","NV": 1 1 1 2 2 2 2 1 2 1 ...
## $ policyform : Factor w/ 9 levels "DF1","DF3","DF6",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ persistency : int 0 7 4 15 14 18 13 4 7 1 ...
## $ companycd : int 1 1 1 1 1 1 1 1 17 1 ...
## $ carriercd : Factor w/ 2 levels "CSEICO","CSESG": 1 1 1 1 1 1 1 1 2 1 ...
## $ agency_group : Factor w/ 605 levels "","1ST CENTURY INS SVCS INC.",..: 260 459 39 515 216 455 158 533 568 90 ...
## $ producername : Factor w/ 1267 levels "1ST CENTURY INS SVCS INC.",..: 443 999 313 1098 122 993 766 1143 1201 142 ...
## $ territory : Factor w/ 10 levels "","AZ-A","AZ-T",..: 2 3 2 7 6 7 5 2 6 2 ...
## $ risknumber : int 1 1 1 1 1 1 1 1 1 1 ...
## $ risktype : Factor w/ 2 levels "Dwelling","Homeowners": 1 1 1 1 1 1 1 1 1 1 ...
## $ yearbuilt : int 1991 1993 1993 1953 1966 1935 1945 1984 1930 1985 ...
## $ log_yearbuilt : num 7.6 7.6 7.6 7.58 7.58 ...
## $ sqft : int 2500 1600 1400 3100 1500 900 1200 3400 2200 1100 ...
## $ log_sqft : num 7.84 7.38 7.24 8.04 7.37 ...
## $ stories : int 1 1 1 1 1 1 1 1 1 1 ...
## $ roofcd : Factor w/ 7 levels "COMPO","MEMBRANE",..: 6 6 6 1 1 1 1 1 1 6 ...
## $ roofcd_encd : int 7 7 7 8 8 8 8 8 8 7 ...
## $ units : int 1 1 1 1 1 1 1 4 3 1 ...
## $ occupancycd : Factor w/ 3 levels "NO","OCCUPIEDNOW",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ occupancy_encd : int 2 2 2 2 2 2 2 2 2 2 ...
## $ allperilded : int 1000 1000 500 250 500 250 250 1000 1000 1000 ...
## $ waterded : int 0 0 0 0 0 0 0 0 0 0 ...
## $ protectionclass : int 3 4 6 6 2 4 3 3 2 2 ...
## $ constructioncd : Factor w/ 5 levels "AF","B","F","M",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ constructioncd_encd : int 5 5 5 5 5 5 5 5 5 5 ...
## $ fire_risk_model_score : int 0 0 0 0 0 0 0 0 0 0 ...
## $ multipolicyind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ multipolicyindumbrella : int 0 0 0 0 0 0 0 0 0 0 ...
## $ earthquakeumbrellaind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ usagetype : Factor w/ 7 levels "COC","PRIMARY",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ usagetype_encd : int 6 6 6 6 6 6 6 6 6 6 ...
## $ ordinanceorlawpct : int 10 10 10 10 10 10 10 10 10 10 ...
## $ functionalreplacementcost : int 0 0 0 0 0 0 0 0 0 0 ...
## $ homegardcreditind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sprinklersystem : int 0 0 0 0 0 0 0 0 0 0 ...
## $ landlordind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ rentersinsurance : int 0 0 0 0 0 0 0 0 0 0 ...
## $ firealarmtype : int 0 1 0 0 0 0 0 0 1 0 ...
## $ burglaryalarmtype : int 0 1 0 0 0 0 0 0 1 0 ...
## $ waterdetectiondevice : int 0 0 0 0 0 0 0 0 0 0 ...
## $ neighborhoodcrimewatchind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ propertymanager : int 0 0 0 0 0 0 0 0 0 0 ...
## $ safeguardplusind : int 0 0 0 0 0 0 0 0 1 0 ...
## $ kitchenfireextinguisherind : int 1 1 0 0 0 0 0 1 0 1 ...
## $ gatedcommunityind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ deadboltind : int 1 1 1 0 0 0 0 1 0 1 ...
## $ poolind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ replacementcostdwellingind : int 1 1 1 0 0 0 1 0 1 0 ...
## $ replacementvalueind : int 0 0 0 0 0 0 0 0 0 0 ...
## $ serviceline : int 0 0 0 0 0 0 0 0 0 0 ...
## $ equipmentbreakdown : int 0 0 0 0 0 0 0 0 0 0 ...
## $ numberoffamilies : int 1 1 1 1 1 1 1 4 3 1 ...
## $ insuredage : int 58 60 57 NA NA NA NA 37 NA 36 ...
## $ maritalstatus : Factor w/ 5 levels "~","Divorced",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ insurancescore : Factor w/ 3836 levels "(DOES","~","610",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ overriddeninsurancescore : Factor w/ 41 levels "~","01","02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ insurancescorevalue : Factor w/ 92 levels "~","630","645",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ insscoretiervalueband : Factor w/ 22 levels "~","624-632",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ financialstabilitytier : Factor w/ 42 levels "","~","01","02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ allcov_wp : num 638 465 502 2290 630 ...
## $ cova_wp : num 565 323 448 2290 573 664 419 811 881 309 ...
## $ cova_ep : num 54.1 107 407.2 1153.5 353 ...
## $ cova_deductible : int 1000 1000 500 250 500 250 250 1000 1000 1000 ...
## $ log_cova_deductible : num 6.91 6.91 6.21 5.52 6.21 ...
## $ cova_limit : int 300000 200000 200000 900000 300000 300000 200000 400000 400000 200000 ...
## $ log_cova_limit : num 12.5 12.2 12.1 13.7 12.5 ...
## $ cova_ic_nc_water : int 1 1 1 1 1 1 1 1 1 1 ...
## $ hasclaim : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cova_il_nc_water : num 6298 5457 14918 1140 1863 ...
## $ log_cova_il_nc_water : num 8.75 8.6 9.61 7.04 7.53 ...
## $ water_risk_3_blk : int 270 238 164 76 121 104 205 162 145 242 ...
## $ log_water_risk_3_blk : num 5.6 5.47 5.1 4.33 4.8 ...
## $ water_risk_fre_3_blk : int 241 229 184 51 116 77 180 236 135 197 ...
## $ log_water_risk_fre_3_blk : num 5.48 5.43 5.21 3.93 4.75 ...
## $ water_risk_sev_3_blk : int 115 107 92 153 107 138 117 71 111 126 ...
## $ log_water_risk_sev_3_blk : num 4.74 4.67 4.52 5.03 4.67 ...
## $ appl_fail_3_blk : int 5 5 5 3 5 5 1 5 5 5 ...
## $ fixture_leak_3_blk : int 1 4 4 2 2 2 1 1 3 1 ...
## $ pipe_froze_3_blk : int 0 0 2 2 2 2 2 0 2 0 ...
## $ plumb_leak_3_blk : int 5 5 4 1 1 3 4 1 1 5 ...
## $ rep_cost_3_blk : int 4 4 1 1 5 5 5 4 5 5 ...
## $ ustructure_fail_3_blk : int 5 5 5 4 5 5 5 5 5 5 ...
## $ waterh_fail_3_blk : int 1 4 4 0 0 0 0 0 0 1 ...
## [list output truncated]
summary(dataset)
## modeldata_id systemidstart systemidend cal_year
## Min. : 179 Min. : 745 Min. : 745 Min. :2009
## 1st Qu.: 536203 1st Qu.:1225297 1st Qu.:1356851 1st Qu.:2013
## Median :1007917 Median :2844382 Median :2943304 Median :2015
## Mean :1009292 Mean :3440017 Mean :3579325 Mean :2015
## 3rd Qu.:1500796 3rd Qu.:5514105 3rd Qu.:5754124 3rd Qu.:2018
## Max. :2102605 Max. :9136529 Max. :9610884 Max. :2020
##
## startdate enddate startdatetm
## 2019-01-01: 986 2019-01-01 00:00:00: 915 2016-10-01 00:00:00: 16
## 2018-01-01: 854 2017-01-01 00:00:00: 747 2018-05-01 00:00:00: 16
## 2017-01-01: 705 2018-01-01 00:00:00: 747 2018-08-17 00:00:00: 15
## 2016-01-01: 694 2020-01-01 00:00:00: 658 2018-02-28 00:00:00: 14
## 2013-01-01: 642 2015-01-01 00:00:00: 652 2016-10-15 00:00:00: 13
## 2014-01-01: 632 2014-01-01 00:00:00: 624 2018-09-29 00:00:00: 13
## (Other) :9628 (Other) :9798 (Other) :14054
## enddatetm ecy log_ecy
## 2019-05-01 00:00:00: 19 Min. :0.0054 Min. :-5.221356
## 2019-08-17 00:00:00: 18 1st Qu.:0.4380 1st Qu.:-0.825536
## 2017-10-01 00:00:00: 17 Median :0.6516 Median :-0.428324
## 2014-05-01 00:00:00: 16 Mean :0.6239 Mean :-0.581713
## 2019-07-01 00:00:00: 16 3rd Qu.:0.8295 3rd Qu.:-0.186932
## 2014-10-19 00:00:00: 15 Max. :1.0020 Max. : 0.001998
## (Other) :14040
## policynumber policy_uniqueid policyterm policytype
## CAH6012203: 12 Min. : 114 Min. : 1.000 New : 2906
## NVH0643400: 9 1st Qu.: 483369 1st Qu.: 2.000 Renewal:11235
## CAH0751641: 8 Median : 887544 Median : 3.000
## CAH0752785: 8 Mean : 877829 Mean : 4.066
## CAH0755516: 8 3rd Qu.:1295306 3rd Qu.: 6.000
## CAH0755561: 8 Max. :1791878 Max. :12.000
## (Other) :14088
## effectivedate expirationdate policystate policyform
## 2018-08-17: 20 2019-08-17: 20 AZ: 1210 HO3 :9617
## 2018-05-01: 19 2019-05-01: 19 CA:12372 DF3 :3427
## 2016-10-01: 18 2017-10-01: 18 NV: 559 DF6 : 596
## 2013-10-19: 17 2014-10-19: 17 Form3 : 408
## 2016-10-15: 16 2017-10-15: 16 FL3-Special: 68
## 2017-08-01: 16 2018-08-01: 16 FL2-Broad : 17
## (Other) :14035 (Other) :14035 (Other) : 8
## persistency companycd carriercd
## Min. : 0.000 Min. : 1.00 CSEICO:5540
## 1st Qu.: 1.000 1st Qu.: 1.00 CSESG :8601
## Median : 4.000 Median :17.00
## Mean : 6.425 Mean :11.53
## 3rd Qu.: 9.000 3rd Qu.:17.00
## Max. :44.000 Max. :19.00
##
## agency_group
## WESTERN GOLD INS AGCY INC. : 949
## CRUSBERG DECKER INS SVCS INC : 621
## Acrisure of California : 591
## J.E. BROWN and ASSOCS INS SVCS : 560
## ISU INSURANCE SERVICES OF SAN FRANCISCO INC: 545
## PIIB - PACIFIC INTERSTATE INS : 379
## (Other) :10496
## producername territory risknumber
## WESTERN GOLD INS AGCY INC. : 926 CA-B :4481 Min. :1
## J.E. BROWN and ASSOCS INS SVCS: 537 CA-C :3697 1st Qu.:1
## CRUSBERG DECKER INS SVCS INC : 397 CA-A :1839 Median :1
## Acrisure of California LLC : 314 CA-O :1472 Mean :1
## CALIFORNIA INS SPECIALISTS : 223 AZ-A :1053 3rd Qu.:1
## HUB INTERNATIONAL INS SVCS : 218 CA-T : 965 Max. :2
## (Other) :11526 (Other): 634
## risktype yearbuilt log_yearbuilt sqft
## Dwelling : 4116 Min. :1900 Min. :7.533 Min. : 800
## Homeowners:10025 1st Qu.:1972 1st Qu.:7.587 1st Qu.:1500
## Median :1987 Median :7.594 Median :1900
## Mean :1982 Mean :7.592 Mean :2085
## 3rd Qu.:1996 3rd Qu.:7.599 3rd Qu.:2500
## Max. :2019 Max. :7.610 Max. :5000
##
## log_sqft stories roofcd roofcd_encd
## Min. :6.397 Min. :1.000 COMPO :5071 Min. :1.000
## 1st Qu.:7.315 1st Qu.:1.000 MEMBRANE: 180 1st Qu.:7.000
## Median :7.579 Median :1.000 METAL : 39 Median :7.000
## Mean :7.586 Mean :1.235 OTHER :1943 Mean :7.062
## 3rd Qu.:7.857 3rd Qu.:1.000 TAR : 260 3rd Qu.:8.000
## Max. :9.051 Max. :3.000 TILE :6493 Max. :8.000
## WOOD : 155
## units occupancycd occupancy_encd allperilded
## Min. :1.000 NO : 0 Min. :1.000 Min. : 100
## 1st Qu.:1.000 OCCUPIEDNOW:13479 1st Qu.:1.000 1st Qu.: 500
## Median :1.000 TENANT : 662 Median :1.000 Median : 1000
## Mean :1.083 Mean :1.047 Mean : 1144
## 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.: 1000
## Max. :4.000 Max. :2.000 Max. :10000
##
## waterded protectionclass constructioncd constructioncd_encd
## Min. : 0.00 Min. : 0.000 AF :4886 Min. :1.000
## 1st Qu.: 0.00 1st Qu.: 2.000 B : 187 1st Qu.:4.000
## Median : 0.00 Median : 3.000 F :8731 Median :5.000
## Mean : 32.18 Mean : 3.116 M : 197 Mean :4.543
## 3rd Qu.: 0.00 3rd Qu.: 4.000 OTHER: 140 3rd Qu.:5.000
## Max. :10000.00 Max. :10.000 Max. :5.000
##
## fire_risk_model_score multipolicyind multipolicyindumbrella
## Min. :-1.0000 Min. :0.0000 Min. :0.000000
## 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.000000
## Median : 0.0000 Median :0.0000 Median :0.000000
## Mean : 0.3155 Mean :0.1882 Mean :0.004172
## 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.000000
## Max. :12.0000 Max. :1.0000 Max. :1.000000
##
## earthquakeumbrellaind usagetype usagetype_encd ordinanceorlawpct
## Min. :0.000000 COC : 4 Min. :2.00 Min. : 0.000
## 1st Qu.:0.000000 PRIMARY :10222 1st Qu.:6.00 1st Qu.: 0.000
## Median :0.000000 RENTAL : 3859 Median :7.00 Median : 10.000
## Mean :0.002899 SEASONAL : 46 Mean :6.71 Mean : 8.929
## 3rd Qu.:0.000000 SECONDARY : 8 3rd Qu.:7.00 3rd Qu.: 10.000
## Max. :1.000000 UNOCCUPIED: 0 Max. :7.00 Max. :100.000
## VACANT : 2
## functionalreplacementcost homegardcreditind sprinklersystem
## Min. :0.0000000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000000 Median :0.0000 Median :0.00000
## Mean :0.0007779 Mean :0.2037 Mean :0.03027
## 3rd Qu.:0.0000000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000000 Max. :1.0000 Max. :1.00000
##
## landlordind rentersinsurance firealarmtype burglaryalarmtype
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :1.0000 Median :0.0000
## Mean :0.03097 Mean :0.00396 Mean :0.6185 Mean :0.3202
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.0000
##
## waterdetectiondevice neighborhoodcrimewatchind propertymanager
## Min. :0.0000000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000000 Median :0.0000 Median :0.00000
## Mean :0.0002121 Mean :0.0203 Mean :0.01047
## 3rd Qu.:0.0000000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000000 Max. :1.0000 Max. :1.00000
##
## safeguardplusind kitchenfireextinguisherind gatedcommunityind
## Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.3677 Mean :0.4544 Mean :0.01824
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## deadboltind poolind replacementcostdwellingind
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :1.0000 Median :0.00000 Median :0.0000
## Mean :0.7038 Mean :0.05785 Mean :0.2305
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
##
## replacementvalueind serviceline equipmentbreakdown numberoffamilies
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :1.000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.000
## Median :0.00000 Median :0.0000 Median :0.0000 Median :1.000
## Mean :0.02447 Mean :0.1227 Mean :0.1326 Mean :1.083
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :4.000
##
## insuredage maritalstatus insurancescore overriddeninsurancescore
## Min. : 0.00 ~ :6462 ~ :13373 ~ :12898
## 1st Qu.: 45.00 Divorced: 84 99 : 6 99 : 428
## Median : 55.00 Married :3954 ZQWFV : 5 07 : 86
## Mean : 55.09 Single :3484 DWKDS : 4 12 : 66
## 3rd Qu.: 65.00 Widowed : 157 FWKVS : 4 13 : 58
## Max. :114.00 USXQD : 4 04 : 54
## NA's :1273 (Other): 745 (Other): 551
## insurancescorevalue insscoretiervalueband financialstabilitytier
## ~ :14137 ~ :14137 :13376
## 925 : 2 714-731: 2 ~ : 684
## 729 : 1 894-945: 2 14 : 9
## 731 : 1 624-632: 0 07 : 8
## 630 : 0 640-648: 0 13 : 7
## 645 : 0 656-664: 0 06 : 6
## (Other): 0 (Other): 0 (Other): 51
## allcov_wp cova_wp cova_ep cova_deductible
## Min. : 48 Min. : 9 Min. : 1.428 Min. : 100
## 1st Qu.: 656 1st Qu.: 635 1st Qu.: 318.021 1st Qu.: 500
## Median : 907 Median : 959 Median : 565.492 Median : 1000
## Mean :1021 Mean :1164 Mean : 729.635 Mean : 1144
## 3rd Qu.:1230 3rd Qu.:1492 3rd Qu.: 952.465 3rd Qu.: 1000
## Max. :7588 Max. :9468 Max. :8631.976 Max. :10000
##
## log_cova_deductible cova_limit log_cova_limit cova_ic_nc_water
## Min. :4.605 Min. : 100000 Min. : 8.372 Min. :1.000
## 1st Qu.:6.215 1st Qu.: 300000 1st Qu.:12.488 1st Qu.:1.000
## Median :6.908 Median : 400000 Median :12.804 Median :1.000
## Mean :6.862 Mean : 451291 Mean :12.769 Mean :1.021
## 3rd Qu.:6.908 3rd Qu.: 500000 3rd Qu.:13.117 3rd Qu.:1.000
## Max. :9.210 Max. :1300000 Max. :14.644 Max. :3.000
##
## hasclaim cova_il_nc_water log_cova_il_nc_water water_risk_3_blk
## Min. :1 Min. : 0.9 Min. :-0.1054 Min. : 23.0
## 1st Qu.:1 1st Qu.: 2953.0 1st Qu.: 7.9906 1st Qu.: 158.0
## Median :1 Median : 7088.9 Median : 8.8663 Median : 208.0
## Mean :1 Mean : 13549.9 Mean : 8.7756 Mean : 226.7
## 3rd Qu.:1 3rd Qu.: 15966.8 3rd Qu.: 9.6783 3rd Qu.: 270.0
## Max. :1 Max. :522735.2 Max. :13.1668 Max. :1056.0
##
## log_water_risk_3_blk water_risk_fre_3_blk log_water_risk_fre_3_blk
## Min. :3.135 Min. : 21.0 Min. :3.045
## 1st Qu.:5.063 1st Qu.: 124.0 1st Qu.:4.820
## Median :5.338 Median : 174.0 Median :5.159
## Mean :5.328 Mean : 184.5 Mean :5.115
## 3rd Qu.:5.598 3rd Qu.: 221.0 3rd Qu.:5.398
## Max. :6.962 Max. :1044.0 Max. :6.951
##
## water_risk_sev_3_blk log_water_risk_sev_3_blk appl_fail_3_blk
## Min. : 35.0 Min. :3.555 Min. :0.000
## 1st Qu.:109.0 1st Qu.:4.691 1st Qu.:4.000
## Median :129.0 Median :4.860 Median :5.000
## Mean :130.6 Mean :4.845 Mean :4.257
## 3rd Qu.:150.0 3rd Qu.:5.011 3rd Qu.:5.000
## Max. :265.0 Max. :5.580 Max. :5.000
##
## fixture_leak_3_blk pipe_froze_3_blk plumb_leak_3_blk rep_cost_3_blk
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:5.000
## Median :2.000 Median :2.000 Median :4.000 Median :5.000
## Mean :1.894 Mean :1.588 Mean :3.363 Mean :4.825
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
##
## ustructure_fail_3_blk waterh_fail_3_blk loaddate
## Min. :0.000 Min. :0.000 2020-07-29 05:01:19.56:14141
## 1st Qu.:4.000 1st Qu.:0.000
## Median :5.000 Median :2.000
## Mean :4.313 Mean :1.333
## 3rd Qu.:5.000 3rd Qu.:2.000
## Max. :5.000 Max. :5.000
##
## customer_cnt_active_policies customer_cnt_active_policies_binned
## Min. : 1.000 Min. : 1.000
## 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 1.000 Median : 1.000
## Mean : 1.222 Mean : 1.767
## 3rd Qu.: 1.000 3rd Qu.: 1.000
## Max. :28.000 Max. :30.000
##
ggplot(dataset, aes(x = .data[['cova_il_nc_water']], color="red",fill="#56B4E9")) +
geom_histogram(bins=100) +
labs(x = 'cova_il_nc_water', y = 'Count', title = "Histogram of Losses (cova_il_nc_water)")+
scale_color_manual(values=c("red")) +
scale_fill_manual(values=c("#56B4E9")) +
theme(legend.position = "none")
ggplot(dataset, aes(x = .data[['cova_il_nc_water']])) +
geom_density() +
labs(x = 'cova_il_nc_water', y = 'Density', title = "Density of Losses (cova_il_nc_water)")
vec <- dataset$cova_il_nc_water
y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
x <- qnorm(c(0.25, 0.75))
slope <- diff(y)/diff(x)
int <- y[1L] - slope * x[1L]
ggplot(dataset, aes(sample = .data[['cova_il_nc_water']], col='red')) +
stat_qq() +
geom_abline(slope = slope, intercept = int) +
theme(legend.position = "none") +
labs(y = 'cova_il_nc_water', title = "QQ Plot of Losses (cova_il_nc_water)")
ggplot(dataset, aes(x = .data[['log_cova_il_nc_water']], color="red",fill="#56B4E9")) +
geom_histogram(bins=100) +
scale_color_manual(values=c("red")) +
scale_fill_manual(values=c("#56B4E9")) +
labs(x = 'log(cova_il_nc_water)', y = 'Count', title = "Histogram of Losses (log_cova_il_nc_water)")+
theme(legend.position = "none")
ggplot(dataset, aes(x = .data[['log_cova_il_nc_water']])) +
geom_density() +
labs(x = 'log(cova_il_nc_water)', y = 'Density', title = 'Density of Losses (log_cova_il_nc_water)')
vec <- dataset$log_cova_il_nc_water
y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
x <- qnorm(c(0.25, 0.75))
slope <- diff(y)/diff(x)
int <- y[1L] - slope * x[1L]
ggplot(dataset, aes(sample = .data[['log_cova_il_nc_water']], col='red')) +
stat_qq() +
geom_abline(slope = slope, intercept = int) +
theme(legend.position = "none") +
labs(y = 'log(cova_il_nc_water)', title = 'QQ Plot of Losses (log_cova_il_nc_water)')
Looks like gamma distribution and log conversion is more close to normal.
plot_categorical <- function(col_name) {
#dataset[[col_name]] <- as.factor(dataset[[col_name]])
#ggplot(dataset, aes(x=.data[[col_name]], y=cova_il_nc_water, col=.data[[col_name]], fill=.data[[col_name]])) +
# geom_boxplot(notch=notch) +
# theme(legend.position = "none") +
# labs(x = col_name, title = paste("Box Plot of Losses and ", col_name))
if ( col_name == 'roofcd_encd') {
col_name_original <- 'roofcd'
}
else if ( col_name == 'usagetype_encd' ) {
col_name_original <- 'usagetype'
}
else if ( col_name == 'constructioncd_encd' ) {
col_name_original <- 'constructioncd'
}
else if ( col_name == 'occupancy_encd' ) {
col_name_original <- 'occupancycd'
}
else {
col_name_original <- col_name
}
if (grepl('encd',col_name) |
col_name =='fire_risk_model_score' |
col_name =='customer_cnt_active_policies_binned' |
col_name =='cova_deductible' |
col_name =='cova_limit' |
col_name =='protectionclass' |
col_name =='ordinanceorlawpct' |
col_name =='numberoffamilies' |
col_name =='waterded' |
col_name =='units' |
col_name =='sprinklersystem' |
col_name =='stories') {
XBreaks <- as.vector(unlist(unique(dataset[c(col_name)])))
XLabels <- as.vector(unlist(unique(dataset[c(col_name_original)])))
}
else if (col_name =='pipe_froze_3_blk' |
col_name =='water_risk_3_blk' |
col_name =='ustructure_fail_3_blk' |
col_name =='water_risk_fre_3_blk' |
col_name =='waterh_fail_3_blk' |
col_name =='rep_cost_3_blk' |
col_name =='plumb_leak_3_blk' |
col_name =='appl_fail_3_blk' |
col_name =='fixture_leak_3_blk') {
XBreaks <- c(0,1,2,3,4,5)
XLabels <- c('Low', 'Elevated', 'Below Avg', 'Average', 'High', 'Highest')
}
else {
XBreaks <- c(0,1)
XLabels <- c("No", "Yes")
}
pd_col <- pd_gamma_dataset[pd_gamma_dataset$feature == col_name,][c('value','pd')]
colnames(pd_col) <- c(col_name,'pd')
plotdata <- dataset %>%
group_by(.dots = col_name) %>%
dplyr::summarize(n = n(),
mean = mean(cova_il_nc_water),
sd = sd(cova_il_nc_water),
se = sd / sqrt(n) #,
#ci = qt(0.975, df = n - 1) * sd / sqrt(n)
)
plotdata <- merge(plotdata,pd_col,by=col_name)
cols <- c('Mean Losses with standard error'='#f04546','Losses Partial Dependency'='#3591d1')
p1 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +
geom_point(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses with standard error'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses with standard error')) +
geom_errorbar(aes(x = .data[[col_name]], y = mean,group = 1, ymin = mean - se,ymax = mean + se,color='Mean Losses with standard error'),width = .1) +
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p2 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(p1)
print(p2)
#----------------------
pd_col <- pd_normal_dataset[pd_normal_dataset$feature == col_name,][c('value','pd')]
if (nrow(pd_col)>0) {
colnames(pd_col) <- c(col_name,'pd')
plotdata <- dataset %>%
group_by(.dots = col_name) %>%
dplyr::summarize(n = n(),
mean = mean(log_cova_il_nc_water),
sd = sd(log_cova_il_nc_water),
se = sd / sqrt(n) #,
#ci = qt(0.975, df = n - 1) * sd / sqrt(n)
)
plotdata <- merge(plotdata,pd_col,by=col_name)
cols <- c('Mean Losses (Log) with standard error'='#f04546','Losses (Log) Partial Dependency'='#3591d1')
p1 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +
geom_point(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses (Log) with standard error'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = mean,group = 1,color='Mean Losses (Log) with standard error')) +
geom_errorbar(aes(x = .data[[col_name]], y = mean,group = 1, ymin = mean - se,ymax = mean + se,color='Mean Losses (Log) with standard error'),width = .1) +
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
p2 = ggplot(plotdata) +
scale_fill_manual(values=cols) +
scale_colour_manual(values=cols) +
geom_point(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency'), size = 3) +
geom_line(aes(x = .data[[col_name]], y = pd,group = 1,color='Losses (Log) Partial Dependency')) +
scale_x_continuous( breaks=XBreaks, labels= XLabels) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
print(p1)
print(p2)
}
}
plot_categorical('functionalreplacementcost')
plot_categorical('numberoffamilies')
plot_categorical('units')
plot_categorical('stories')
plot_categorical('cova_deductible')
plot_categorical('replacementvalueind')
plot_categorical('neighborhoodcrimewatchind')
plot_categorical('fire_risk_model_score')
ggplot(data=dataset, mapping = aes(x=water_risk_sev_3_blk, y=cova_il_nc_water, color=-cova_il_nc_water)) +
geom_point() +
labs(title = "Losses vs water_risk_sev_3_blk")
The higest losses are between 100 and 200 water_risk_sev_3_blk. We may have not enough data for higher numbers of water_risk_sev_3_blk
plot_categorical('burglaryalarmtype')
plot_categorical('deadboltind')
plot_categorical('waterh_fail_3_blk')
plot_categorical('safeguardplusind')
plot_categorical('equipmentbreakdown')
plot_categorical('cova_limit')
ggplot(data=dataset, mapping = aes(x=water_risk_3_blk, y=cova_il_nc_water, color=-cova_il_nc_water)) +
geom_point() +
labs(title = "Losses vs water_risk_3_blk")
There are more higher losses around 250 water_risk_3_blk
plot_categorical('usagetype_encd')
plot_categorical('multipolicyindumbrella')
plot_categorical('constructioncd_encd')
plot_categorical('replacementcostdwellingind')
plot_categorical('roofcd_encd')
plot_categorical('gatedcommunityind')
plot_categorical('homegardcreditind')
plot_categorical('ustructure_fail_3_blk')
ggplot(data=dataset, mapping = aes(x=water_risk_fre_3_blk, y=cova_il_nc_water, color=-cova_il_nc_water)) +
geom_point() +
labs(title = "Losses vs water_risk_fre_3_blk")
plot_categorical('ecy')
ggplot(data=dataset, mapping = aes(x=ecy, y=cova_il_nc_water, color=-cova_il_nc_water)) +
geom_point() +
labs(title = "Losses vs ecy")
plot_categorical('appl_fail_3_blk')
plot_categorical('pipe_froze_3_blk')
plot_categorical('firealarmtype')
plot_categorical('fixture_leak_3_blk')
plot_categorical('sqft')
ggplot(data=dataset, mapping = aes(x=sqft, y=cova_il_nc_water, color=-cova_il_nc_water)) +
geom_point() +
labs(title = "Losses vs sqft")
Severity is higher in larger properties
plot_categorical('yearbuilt')
ggplot(data=dataset, mapping = aes(x=yearbuilt, y=cova_il_nc_water, color=-cova_il_nc_water)) +
geom_point() +
labs(title = "Losses vs YearBuilt")
Severity is higher in newer properties
plot_categorical('customer_cnt_active_policies_binned')
ggplot(data=dataset, mapping = aes(x=customer_cnt_active_policies_binned, y=cova_il_nc_water, color=-cova_il_nc_water)) +
geom_point() +
labs(title = "Losses vs customer_cnt_active_policies_binned")
plot_categorical('protectionclass')
plot_categorical('multipolicyind')
plot_categorical('rep_cost_3_blk')
plot_categorical('plumb_leak_3_blk')
plot_categorical('rentersinsurance')
plot_categorical('sprinklersystem')
plot_categorical('kitchenfireextinguisherind')
plot_categorical('poolind')
plot_categorical('landlordind')
plot_categorical('occupancy_encd')
plot_categorical('ordinanceorlawpct')
plot_categorical('waterded')
plot_categorical('serviceline')
plot_categorical('propertymanager')
plot_categorical('earthquakeumbrellaind')
ggplot(data=dataset, mapping = aes(x=yearbuilt, y=cova_il_nc_water, color=roofcd)) +
geom_point() +
labs(title = "Losses vs YearBuilt and Roofcd Limit")
ggplot(data=dataset, mapping = aes(x=sqft, y=cova_il_nc_water, color=-cova_limit)) +
geom_point() +
labs(title = "Losses vs Sqft and CovA Limit")